{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DRKG Relation Similarity Analysis based on link recommendations\n",
    "\n",
    "This notebook performs a similarity analysis of the different link types in the DRKG based on their recommendation outcomes. Specifically, for a given node we predict the K most similar neighbors under a certain link type, and then repeat this prediction for every link type. Link types whose predicted neighbors overlap significantly are considered more similar."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import csv\n",
    "import sys\n",
    "import torch as th\n",
    "sys.path.insert(1, '../utils')\n",
    "from utils import download_and_extract\n",
    "download_and_extract()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the function used for scoring the edges. This should coincide with the function used to learn the embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transE_l2(head, rel, tail, gamma=12.0):\n",
    "    \"\"\"Score triples with the TransE L2 function: gamma - ||head + rel - tail||_2.\n",
    "\n",
    "    The three inputs are combined elementwise, so they may have identical\n",
    "    shapes or any shapes that broadcast together. The norm is taken over the\n",
    "    last axis, which is therefore removed from the result.\n",
    "\n",
    "    Args:\n",
    "        head: tensor holding head-entity embeddings.\n",
    "        rel: tensor holding relation embeddings.\n",
    "        tail: tensor holding tail-entity embeddings.\n",
    "        gamma: value the L2 distance is subtracted from. Defaults to 12.0,\n",
    "            the constant that used to be hard-coded in this function; it\n",
    "            should match the setting the embeddings were trained with.\n",
    "\n",
    "    Returns:\n",
    "        Tensor of scores; a smaller distance gives a higher score.\n",
    "    \"\"\"\n",
    "    score = head + rel - tail\n",
    "    return gamma - th.norm(score, p=2, dim=-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading Mapping files\n",
    "\n",
    "Load the mapping files that give, for each DRKG entity and relation, the id used by the embedding model. The entity and relation embeddings are loaded as well. Adjust the input and embedding folders below if your files are stored elsewhere."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "97238\n",
      "107\n"
     ]
    }
   ],
   "source": [
    "# Folders holding the id-mapping files and the pretrained embeddings.\n",
    "folder_with_training_part='../data/drkg/embed/'\n",
    "folder_with_embeddings='../data/drkg/embed/'\n",
    "\n",
    "\n",
    "def load_id_mapping(tsv_path):\n",
    "    \"\"\"Read a two-column, tab-separated file of (name, id) rows into a dict.\n",
    "\n",
    "    Every row is treated as data; no header row is skipped.\n",
    "\n",
    "    Args:\n",
    "        tsv_path: path of the mapping file.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each name (first column) to its id (second column) as int.\n",
    "    \"\"\"\n",
    "    mapping = {}\n",
    "    with open(tsv_path, newline='', encoding='utf-8') as csvfile:\n",
    "        reader = csv.DictReader(csvfile, delimiter='\\t', fieldnames=['entity', 'id'])\n",
    "        for row_val in reader:\n",
    "            mapping[row_val['entity']] = int(row_val['id'])\n",
    "    return mapping\n",
    "\n",
    "\n",
    "# Both mapping files share one layout, so a single helper reads them. This also\n",
    "# keeps the loop variables out of the kernel namespace (the builtin id() used\n",
    "# to be overwritten here).\n",
    "entity2id = load_id_mapping(folder_with_training_part + \"entities.tsv\")\n",
    "print(len(entity2id))\n",
    "\n",
    "rel2id = load_id_mapping(folder_with_training_part + \"relations.tsv\")\n",
    "print(len(rel2id))\n",
    "\n",
    "# Embedding matrices stored as .npy files next to the mapping files.\n",
    "node_emb = np.load(folder_with_embeddings+'DRKG_TransE_l2_entity.npy')\n",
    "rel_emb = np.load(folder_with_embeddings+'DRKG_TransE_l2_relation.npy')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading triplets\n",
    "\n",
    "Load the triplets and map their DRKG entity and relation names to the ids used by the embedding model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Triplets are collected in Python lists first and converted to arrays once at the end.\n",
    "head_ids = []\n",
    "rel_ids = []\n",
    "tail_ids = []\n",
    "p0_rows = []\n",
    "with open(\"../data/drkg/drkg.tsv\", newline='', encoding='utf-8') as csvfile:\n",
    "    reader = csv.DictReader(csvfile, delimiter='\\t', fieldnames=['head', 'rel', 'tail'])\n",
    "    for row_val in reader:\n",
    "        head = row_val['head']\n",
    "        rel = row_val['rel']\n",
    "        tail = row_val['tail']\n",
    "\n",
    "        # Translate the names into ids; a name missing from the mappings raises KeyError.\n",
    "        head_ids.append(entity2id[head])\n",
    "        rel_ids.append(rel2id[rel])\n",
    "        tail_ids.append(entity2id[tail])\n",
    "        # Keep the original (head, rel, tail) names alongside the ids.\n",
    "        p0_rows.append((head, rel, tail))\n",
    "\n",
    "# These arrays are later used to index torch tensors, so a 64-bit integer dtype is\n",
    "# requested explicitly instead of relying on the platform's default integer width.\n",
    "head_ids = np.array(head_ids, dtype=np.int64)\n",
    "rel_ids = np.array(rel_ids, dtype=np.int64)\n",
    "tail_ids = np.array(tail_ids, dtype=np.int64)\n",
    "triple_ids = np.arange(head_ids.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Link prediction\n",
    "\n",
    "Specify number of seed nodes to select for link prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores={}   # relation name -> list of per-seed score tensors, filled by the scoring cell\n",
    "L=100       # number of seed heads to sample\n",
    "SEED=0      # fixed so that the same seeds are sampled on every run\n",
    "device = th.device('cpu')\n",
    "with th.no_grad():\n",
    "    # as_tensor returns an existing tensor unchanged, so this cell can be re-run safely.\n",
    "    node_emb = th.as_tensor(node_emb).to(device)\n",
    "    rel_emb = th.as_tensor(rel_emb).to(device)\n",
    "    # Index tensors are forced to 64-bit integers, the dtype torch indexing always accepts.\n",
    "    head_ids = th.as_tensor(head_ids, dtype=th.long).to(device)\n",
    "    rel_ids = th.as_tensor(rel_ids, dtype=th.long).to(device)\n",
    "    tail_ids = th.as_tensor(tail_ids, dtype=th.long).to(device)\n",
    "\n",
    "    # The per-triplet embedding gathers that used to be built here were never read\n",
    "    # afterwards, so they are no longer materialized.\n",
    "\n",
    "    # Select L random heads: shuffle the triplet positions and take the heads of the\n",
    "    # first L. A node can be drawn more than once if it heads several sampled triplets.\n",
    "    th.manual_seed(SEED)\n",
    "    perm = th.randperm(head_ids.shape[0])\n",
    "    seeds = head_ids[perm[:L]]\n",
    "    seed_heads = node_emb[seeds]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For each link type, predict the scores between every selected seed node and all nodes in the graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every relation type, score each seed head against all nodes as candidate tails.\n",
    "# scores[rel] becomes a list with one 1-D tensor (one score per node) per seed.\n",
    "for rel, rel_id in rel2id.items():\n",
    "    rel_embedding = rel_emb[rel_id]\n",
    "    # Broadcasting combines the single seed and relation vectors with every row of\n",
    "    # node_emb, so no repeated (num_nodes x dim) copies have to be allocated.\n",
    "    scores[rel] = [transE_l2(seed_head, rel_embedding, node_emb) for seed_head in seed_heads]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top K link prediction\n",
    "Specify the number of top-scoring neighbors used to evaluate the overlap of the link predictions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "K=10\n",
    "# For every relation and every seed, keep the indices of the K highest-scoring nodes.\n",
    "# topk selects them directly (best first) instead of fully sorting each score vector;\n",
    "# K is clamped to the vector length because topk rejects a larger k.\n",
    "# Which of several exactly tied scores is kept is not specified.\n",
    "top_neighbors={}\n",
    "for rel in scores.keys():\n",
    "    top_neighbors[rel]=[th.topk(score, min(K, score.shape[-1])).indices for score in scores[rel]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overlap among predicted neighbors\n",
    "Calculate the overlap of the predicted neighboring nodes for each pair of relation types, measured as the Jaccard index averaged over the seed nodes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn every predicted-neighbor tensor into a set once per relation, rather than\n",
    "# rebuilding the same sets for each relation pair.\n",
    "neighbor_sets={rel: [set(nbrs.cpu().tolist()) for nbrs in top_neighbors[rel]] for rel in scores.keys()}\n",
    "\n",
    "# Each entry is [relation_1, relation_2, mean Jaccard index]; every unordered\n",
    "# pair of relations appears exactly once.\n",
    "overlap_of_predicted_neighbors=[]\n",
    "keys=list(scores.keys())\n",
    "for i in range(len(keys)):\n",
    "    for j in range(i+1,len(keys)):\n",
    "        e1=keys[i]\n",
    "        e2=keys[j]\n",
    "        n_1=neighbor_sets[e1]\n",
    "        n_2=neighbor_sets[e2]\n",
    "        # Sum, over the seed nodes, of |A & B| / |A | B| for the neighbor sets\n",
    "        # predicted under the two relations.\n",
    "        jacard=0.0\n",
    "        for set_1, set_2 in zip(n_1, n_2):\n",
    "            union=set_1 | set_2\n",
    "            # Two empty predictions (K=0) contribute 0 instead of dividing by zero.\n",
    "            if union:\n",
    "                jacard+=len(set_1 & set_2)/len(union)\n",
    "        # With no seeds at all, report 0 rather than dividing by zero.\n",
    "        jacard=jacard/len(n_1) if n_1 else 0.0\n",
    "        overlap_of_predicted_neighbors.append([e1,e2,jacard])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Store sorted overlap file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order the relation pairs from the highest to the lowest overlap: sort ascending, then flip.\n",
    "overlap_of_predicted_neighbors_sort = sorted(overlap_of_predicted_neighbors, key=lambda entry: float(entry[2]))\n",
    "overlap_of_predicted_neighbors_sort.reverse()\n",
    "\n",
    "# One header line followed by one tab-separated line per relation pair.\n",
    "overlap_of_predicted_neighbors_store = [\"edge_type1\\tedge_type2\\tpercentage of overlapping predicted edges\\n\"]\n",
    "overlap_of_predicted_neighbors_store.extend(\n",
    "    \"{}\\t{}\\t{}\\n\".format(*entry[:3]) for entry in overlap_of_predicted_neighbors_sort\n",
    ")\n",
    "\n",
    "# The output file name embeds K, the number of neighbors that were compared.\n",
    "entity_file = \"percentage_of_overlapping_predicted_edges_per_edge_pair\" + str(K) + \"v1.tsv\"\n",
    "with open(entity_file, 'w+') as out_file:\n",
    "    out_file.write(\"\".join(overlap_of_predicted_neighbors_store))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
